# Load required packages and data
library(tidyverse)
library(pander)
library(fitdistrplus)
library(AER)
dat <- readRDS("t_test_data_sbs_pt01_10sub_10clust_10000sims.rds")
pval_b1_model <- unlist(dat[1,])
est_sb2 <- unlist(dat[2,]) # let's assume anything less than 1e-6 is the weird ones
tiny_est <- est_sb2 < 1e-6
resid_var_model <- unlist(dat[3,])
MSB <- unlist(dat[5,])
MSE <- unlist(dat[6,])
clustvar0 <- unlist(dat[7,])
clustvar1 <- unlist(dat[8,])
ftest_pval <- unlist(dat[9,])
pval_tt_eq_var <- unlist(dat[13,])
pval_tt_uneq_var <- unlist(dat[14,])
df_tt_uneq_var <- unlist(dat[15,])
beta0 <- unlist(dat[17,])
beta0_se <- unlist(dat[18,])
beta1 <- unlist(dat[19,])
beta1_se <- unlist(dat[20,])
plotdat <- cbind.data.frame(tiny_est, pval_b1_model, est_sb2, resid_var_model, #modelNLME,
MSB, MSE,
clustvar0, clustvar1, ftest_pval,# ttest_eq_var, ttest_uneq_var,
#clustmeans,
pval_tt_eq_var, pval_tt_uneq_var, df_tt_uneq_var,
beta0, beta0_se, beta1, beta1_se)#, test)
plotdat$icc <- MSB / (MSB + MSE)
Gathering data on how a ‘naive’ NLME model estimates those parameters, as well as what equal- and unequal-variance t-tests show.
Mean square within clusters: \[ MSE = \frac{1}{k(n-1)} \sum_i \sum_j (y_{ij} - \bar{y}_{i.} )^2 \] Mean square between clusters: \[ MSB = \frac{1}{k-1} \sum_i \sum_j (\bar{y}_{i.} - \bar{y}_{..} )^2 \]
I looked at a lot of relationships between variables, and the ratio \(MSB / MSE\) was the one that popped out as a clear predictor of a tiny estimate.
cutoff_MSBMSE <- min(MSB[tiny_est==F] / MSE[tiny_est==F])
cutoff_ICC <- min(plotdat$icc[tiny_est==F])
ggplot(data = plotdat) +
geom_histogram(aes(x = MSB/MSE), color = "black", fill = "grey") +
labs(title = "Ratio around .1 highly determinative of a tiny estimate") +
facet_grid(rows = vars(tiny_est), labeller = label_both)
ggplot(data = plotdat) +
geom_histogram(aes(x = MSB), color = "black", fill = "grey") +
labs(title = "MSB alone is less predictive") +
facet_grid(rows = vars(tiny_est), labeller = label_both)
ggplot(data = plotdat) +
geom_point(aes(x = MSB, y = MSE, color = tiny_est), alpha = .2, stroke = 0) +
labs(title = "Ratio around .1 highly determinative of a tiny estimate")
ggplot(data = plotdat) +
geom_point(aes(x = MSB, y = MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_abline(aes(slope = 1/cutoff_MSBMSE, intercept = 0)) +
labs(title = "More variability in that finding among the tiny estimates") +
facet_grid(rows = vars(tiny_est), labeller = label_both)
ggplot(data = plotdat) +
labs(title = "No change in the pattern as MSE varies") +
geom_point(aes(x = MSE, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_hline(aes(yintercept = cutoff_MSBMSE))
ggplot(data = plotdat) +
labs(title = "No change in the pattern as MSB varies") +
geom_point(aes(x = MSB, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) + geom_hline(aes(yintercept = cutoff_MSBMSE))
ggplot(data = plotdat) +
geom_point(aes(x = MSB/MSE, y = clustvar0/clustvar1, color = tiny_est), alpha = .3, stroke = 0) + ylim(c(0,3)) +
labs(title = "Pattern seems invariant to ratio of between-cluster variances")
ggplot(data = plotdat) +
geom_point(aes(x = MSB/MSE, y = clustvar0 - clustvar1, color = tiny_est), alpha = .3, stroke = 0) +
labs(title = "Pattern seems invariant to difference in between-cluster variances")
ggplot(data = plotdat) +
geom_point(aes(x = MSE, y = icc, color = tiny_est), alpha = .2, stroke = 0) +
labs(title = "Using the ICC instead of the ratio might also work, results are the same") +
geom_hline(aes(yintercept = cutoff_ICC)) +
facet_grid(rows = vars(tiny_est), labeller = label_both)
ggplot(data = plotdat) +
geom_point(aes(x = MSE, y = MSB/MSE, color = tiny_est), alpha = .2, stroke = 0) +
labs(title = "Using the ratio") +
geom_hline(aes(yintercept = cutoff_MSBMSE)) +
facet_grid(rows = vars(tiny_est), labeller = label_both)
sum((MSB[tiny_est==T] / MSE[tiny_est==T]) > cutoff_MSBMSE)
## [1] 628
sum((plotdat$icc[tiny_est==T]) > cutoff_ICC)
## [1] 628
ggplot(data = plotdat) +
geom_histogram(aes(x = pval_tt_uneq_var), color = "black", fill = "grey") +
facet_grid(rows = vars(tiny_est), labeller = label_both) +
xlab("p-value for welch t-test, allows unequal variances (results the same with eq vars)") +
labs(title = "Small p-values on a t-test correlated with tiny estimates; makes sense, more power")
What to make of this? Just a signal-to-noise issue?
temp <- cbind.data.frame(MSB, tiny_est)
#3770, 3211 is an outlier...
plotdat[3770,]
plotdat[3211,]
# Actually look at data spread in some non-outlier cases in both sets (tiny / not tiny)
datasets <- dat[16,]
ggplot(data = datasets[[3770]]) + geom_point(aes(x = clustid, y = linpred))
ggplot(data = datasets[[3211]]) + geom_point(aes(x = clustid, y = linpred))
ggplot(data = datasets[[3212]]) + geom_point(aes(x = clustid, y = linpred)) + labs(title = "An unsurprising one")
j <- seq(from = .098, to = .102, by = .0005)
y <- vector()
for(i in j){
y <- c(y, sum(ratio[tiny_est == T] > i) / length(ratio[tiny_est == T]) + sum(ratio[tiny_est == F] < i) / length(ratio[tiny_est == F]))
}
cbind.data.frame(j, y)
tdat <- cbind.data.frame(est_sb2)
tmod <- AER::tobit(est_sb2 ~ 1, left = 1e-6, data = tdat) # roughly log(-12)
# ggplot(data = as.data.frame(est_sb2)) +
# geom_histogram(aes(x = est_sb2, y = ..density..), color = "black", fill = "grey", binwidth = .006) +
# geom_vline(aes(xintercept = summary(tmod)$coefficients[1,1]), linetype = "dotdash", color = "red") +
# geom_vline(aes(xintercept = sb2), color = "black") +
# stat_function(fun = dnorm, args = list(mean = summary(tmod)$coefficients[1,1], sd = summary(tmod)$scale), color = "red") +
# labs(title = "Tobit looks probable; note that mean of Tobit is lower than true value")
ggplot(data = as.data.frame(est_sb2)) +
geom_histogram(aes(x = est_sb2, y = ..density..), color = "black", fill = "grey", binwidth = .006) +
geom_vline(aes(xintercept = summary(tmod)$coefficients[1,1]), linetype = "dotdash", color = "red") +
geom_vline(aes(xintercept = sb2), color = "black") +
stat_function(fun = dnorm, args = list(mean = summary(tmod)$coefficients[1,1], sd = summary(tmod)$scale), color = "red") +
ylim(c(0, 15))+
labs(title = "Same graph, focusing on fit of non-zero parts")